# load packages and data --------------------------------------------------
library(haven)
library(tidyverse)
library(dplyr)
library(psych)
library(stargazer)
library(lme4)
library(performance)
library(nortest)

library("intsvy")
library("dplyr")
library("ggplot2")
library("tidyr")
library(BIFIEsurvey)
library(HLMdiag)

root <- "C:/Users/Wieczorek_W_Station/Dropbox/Projekte/Effektive_Schulsteuerung_PISA/Daten/PISA/"
#root <- "C:/Users/olive/Dropbox/Projekte/Effektive_Schulsteuerung_PISA/Daten/PISA/"

path <- paste0(root,"Aufbereitete_Daten/")
output <- paste0(root,"Output_BIFIE/")
setwd(path)

y <-2009


# Define function calls ---------------------------------------------------


##Output generator
bifie_stats <- function(bifie_output){
  stats <- bifie_output$stat
  
  stats <- stats %>%
    select(parameter,est, SE,t,p) %>%
    as_tibble()
  
  stats <- stats %>%
    add_row(parameter = "no Students", est = bifie_output$N)
  
  stats <- stats %>%
    add_row(parameter = "no Schools", est = bifie_output$Nclusters)
  
  stats <- stats %>% 
    filter(substr(parameter,1,4)=="beta" |
             substr(parameter,1,3)=="ICC" |
             substr(parameter,1,2)=="R2" |
             substr(parameter,1,3) =="no "
    )#
  return(stats)
}
## Calculate Variance inflation Factor of IVs required for the Regression
## Analysis
vif_calculator <- function(bifie_postesitmation, original_dataframe){
  ##prepare the dataframe
  pred_pre <- bifie_postesitmation %>%
    filter(str_detect(parameter,"beta"))
  ##get the parameters
  parms <-  gsub("beta_","", pred_pre$parameter) %>%
    as.list()
  
  ## Create a dataframe based on the IVs
  ## But: check if an interaction term is present.
  ## Interaction terms contain the ":"-character.
  if(sum(str_count(pattern=":", as.vector(parms))) > 0){  
    ##create a reduced dataframe
    model_df_reduced <- original_dataframe[colnames(original_dataframe) %in% parms]
    model_df_reduced <- model_df_reduced %>%
      mutate(Z_ESCS_Z_MEAN_ESCS = Z_ESCS * Z_MEAN_ESCS)
  }  else{
    model_df_reduced <- original_dataframe[colnames(original_dataframe) %in% parms]
  }
  
  ##prepare data for further investigation
  Vars <- model_df_reduced %>%
    colnames() #Vector for variable names
  VIF <- c() # empty vector for VIFs
  
  vif_dataframe <- tibble(variables = Vars)
  
  ## If our dataframe got more than one variable, then perform VIF
  ## Otherwise: return NA-values
  if(nrow(vif_dataframe) >1){
    ## generate a negation to filter DV
    `%nin%` = Negate(`%in%`) 
    for(V in Vars){
      DV <- V
      IVS <- as.vector(Vars)
      IVS <- IVS[IVS %nin% DV]
      
      IV_formula <- ""
      checkpos <- length(IVS)
      pos <- 1
      
      for(i in IVS){
        if(pos < checkpos) {
          IV_formula <- paste0(IV_formula,i," + ")
          pos <- pos + 1 
        }
        else{
          IV_formula <- paste0(IV_formula,i)
        }#
      }
      
      ## cerate a formula for calculating the VIF
      m <- as.formula(paste(DV, "~", IV_formula))
      
      ## Regress one IV of the imputated model on other IVs
      vif_model <- summary(lm(m, data = model_df_reduced))
      
      ## calculate VIF as 1/(1-R?j_th variable)
      value <- as.numeric(1/(1-vif_model$r.squared))
      ##show model
      vif_model
      
      ##append VIF to variable
      VIF <- append(VIF,as.numeric(value))
      
    }
    
    ## generate VIF- Dataframe
    vif_dataframe <- vif_dataframe %>%
      mutate(VIF = VIF)
    
    ## Add mean VIF
    vif_dataframe <- vif_dataframe %>%
      add_row(variables = "MEAN_VIF", VIF = mean(VIF))
  } else{
    ## generate VIF- Dataframe
    vif_dataframe <- vif_dataframe %>%
      mutate(VIF = NA)
    
    ## Add mean VIF
    vif_dataframe <- vif_dataframe %>%
      add_row(variables = "MEAN_VIF", VIF = NA)
  }
  return(vif_dataframe)
}

# Calculate Error Terms and Shapiro-Francia Test ---------------------------
## predict values for DV
## try to fetch the error terms for each model
sf_calculator <- function(bifie_postestimation,original_dataframe){
  pred_pre <- bifie_postestimation %>% 
    filter(str_detect(parameter,"beta"))
  
  ## fetch intercept_value
  alpha <- pred_pre %>%
    filter(str_detect(parameter, "Intercept")) %>%
    select(est)
  alpha <- alpha$est
  
  ##get the parameters
  betas <-  gsub("beta_","", pred_pre$parameter) %>%
    as.vector()
  
  # exclude intercept from parameter-List
  pred_pre$parameter <- betas
  pred_pre <- pred_pre %>%
    filter(str_detect(betas, "Intercept", negate= TRUE))
  
  
  
  ## check for interaction terms
  interactions <- betas[str_detect(betas,":")]
  
  ## split interaction terms
  interactions_split <- str_split(interactions, pattern =":")
  
  ## reduce beta-values
  betas <- betas[str_detect(betas, "Intercept", negate= TRUE)]
  betas <- betas[str_detect(betas,interactions, negate=TRUE)]
  
  
  ##add DV-Variables to parameters
  dep ="MEAN"
  dv_parms <- original_dataframe %>%
    select(contains("MEAN") & contains("PV")) %>%
    colnames() 
  
  dv_iv_parms <- append(dv_parms,betas)
  
  model_df_reduced <- original_dataframe %>%
    select(dv_iv_parms)
  
  if(length(interactions > 0)){
    print(paste("interactions present for model:",deparse(substitute(bifie_postestimation))))
    for(i in length(interactions_split)){
      print(interactions_split[[i]])
      interaction_variables = interactions_split[[i]]
      model_df_reduced[paste0(interaction_variables[1],
                              "_",
                              interaction_variables[2])
      ] <- model_df_reduced[interaction_variables[1]] * 
        model_df_reduced[interaction_variables[2]]
      
      ## add interaction term to betas
      
      betas <- append(betas,paste0(interaction_variables[1],
                                   "_",
                                   interaction_variables[2]))
    }
  }
  
  pred_pre$parameter <- str_replace(as.vector(pred_pre$parameter),":","_")
  
  ## calculate beta-values * values given per model
  for(b in betas){
    beta_value <- pred_pre %>%
      filter(parameter == b) %>%
      select(est) %>%
      as.numeric()
    varname_new <- paste0(b,"_predict")
    model_df_reduced[varname_new] <- model_df_reduced[b] * beta_value
    
  }
  model_df_reduced["prediction"] <- alpha + 
    rowSums(
      model_df_reduced[
        str_detect(
          colnames(model_df_reduced), "_predict")
      ]
    )
  ##create an empty list to store error-terms
  
  
  dvs <- model_df_reduced %>%
    select(starts_with("PV") & ends_with("MEAN")) %>%
    colnames()
  
  ## generate distances over all plausible values
  
  error_count <- 1
  for(d in dvs){
    
    model_df_reduced[
      paste0("ERRORS_",as.character(error_count)
      )
    ] <- model_df_reduced["prediction"] - model_df_reduced[d]
    
    
    error_count <- error_count +1
  }
  
  model_df_reduced["MEAN_ERROR"] <- model_df_reduced %>%
    select(starts_with("ERROR")) %>%
    rowMeans()
  
  errors <- model_df_reduced$MEAN_ERROR
  
  sampling <- NA
  if(length(errors) <=5000){
    sf_testvalue <- sf.test(errors)
    sampling <- "no"
  } else{
    seed = 42
    sf_testvalue <- sf.test(sample(errors, size = 5000))
    sampling <- "yes"
  }
  
  sf_testvalue <- tibble(model = deparse(substitute(bifie_postestimation)),
                         W = sf_testvalue$statistic,
                         P = sf_testvalue$p.value,
                         imputations = length(dvs),
                         obs = nrow(original_dataframe),
                         sampling = sampling)
  
  ##H0 = Variable follows a normal distribution
  ##H1 = Variable does not follow a normal distribution
  if(sf_testvalue$P < 0.05){
    sf_testvalue <- sf_testvalue %>% 
      mutate(normally_distributed = "no")
  }else{
    sf_testvalue<- sf_testvalue %>% 
      mutate(normally_distributed = "yes")
  }
  
  return(sf_testvalue)
}

# Load Country Dataframe for 2009 -----------------------------------------
#countries <- as.list(unique(countries))

#countries <- c("DEU","GBR","USA","CAN","FIN","SWE", "AUS","CHE","FRK")
countries <- c("KOR", "SGP","USA","CAN")


for(country in countries){
  print(paste("currently at:",country))
  
  
  path_year <- paste0(path,"2009","/")
  setwd(path_year)
  
  
  df <- read.csv(paste0(country,"with_indices_2009.csv")) 
  df <- as_tibble(df)
  df <- df %>% 
    select(-starts_with("X"))
  
  ## rename variables
  df <- df %>% 
    rename(isei = bfmj)

  for(i in seq(1,5)){
    df[paste0("pv",as.character(i),"mean")] <- rowMeans(df[c(paste0("pv",as.character(i),"math"),
         paste0("pv",as.character(i),"read"),
         paste0("pv",as.character(i),"scie"))], na.rm = T)
  }
  
  for(i in seq(1,5)){
    df[paste0("pv",as.character(i),"read_scie")] <- rowMeans(df[c(paste0("pv",as.character(i),"read"),
                                                             paste0("pv",as.character(i),"scie"))], na.rm = T)
  }
 

# Select Variables relevant for regression analysis -----------------------
  
  
  model_df <-  df %>%
    select(ends_with("mean"), ends_with("read_scie"), escs , mean_escs ,  mig_2nd , mean_mig , langn,
             disclima , mean_disclima, student_absenteeism , teacher_absenteeism ,
             autonomy , leadership , accountability ,schoolid, w_fstuwt, stidstd,  schoolid,
           starts_with("w_f")) %>% 
    na.omit()
  

# Change Column Names to upper --------------------------------------------
  colnames(model_df) <- model_df %>%
    colnames() %>%
    toupper()  
  


# z-standardize variables -------------------------------------------------

  model_df <- model_df %>% mutate(Z_ESCS = ESCS - mean(ESCS) / sd(ESCS),
                      Z_MEAN_ESCS = MEAN_ESCS - mean(MEAN_ESCS) / sd(MEAN_ESCS),
                      Z_MEAN_MIG = MEAN_MIG -  mean(MEAN_MIG) / sd(MEAN_MIG),
                      Z_DISCLIMA = DISCLIMA - mean(DISCLIMA) / sd(DISCLIMA),
                      Z_MEAN_DISCLIMA = MEAN_DISCLIMA - mean(MEAN_DISCLIMA) / sd(MEAN_DISCLIMA),
                      Z_TEACHER_ABSEENTEISM = TEACHER_ABSENTEEISM - mean(TEACHER_ABSENTEEISM) / sd(TEACHER_ABSENTEEISM),
                      Z_STUDENT_ABSEENTEISM = STUDENT_ABSENTEEISM - mean(STUDENT_ABSENTEEISM) / sd(STUDENT_ABSENTEEISM),
                      Z_AUTONOMY = AUTONOMY - mean(AUTONOMY) / sd(AUTONOMY),
                      Z_LEADERSHIP = LEADERSHIP -mean(LEADERSHIP) / sd(LEADERSHIP),
                      Z_ACCOUNTABILITY = ACCOUNTABILITY - mean(ACCOUNTABILITY) / sd(ACCOUNTABILITY) 
                      )
  
  model_df <-  model_df %>%
      select(starts_with("Z") , ends_with("mean"), contains("read_sci"), MIG_2ND, LANGN, MEAN_MIG,
             ends_with("TEEISM"), W_FSTUWT, starts_with("W_FSTR"),
             SCHOOLID)
    
# Generate BIFIE-Data -----------------------------------------------------
  RR <- 80
  set.seed(42)  

  test <- BIFIE.data.jack(as.data.frame(model_df), jktype = "RW_PISA",
                          pv_vars = c("MEAN","READ_SCIE"),
                          wgtrep="W_FSTR",
                          pvpre = paste0("PV",1:5),
                          fayfac=1 / RR / ( 1 - .5 )^2 )
  
  bifie_data <- test
  
  if(country == "SGP"){
  
    print("calculate model 0")
  m0 <-BIFIE.twolevelreg(BIFIEobj = test,
                         dep = "READ_SCIE",
                         idcluster =  "SCHOOLID",
                         formula.fixed = ~ 1, 
                         formula.random = ~ 1,
                         wgtlevel1 <- "W_FSTUWT",
                         wgtlevel2="one",
                         maxiter=1000, se = TRUE)
  
  print("calculate model 1")
  m1 <-BIFIE.twolevelreg(BIFIEobj = test,
                         dep = "READ_SCIE",
                         idcluster =  "SCHOOLID",
                         formula.fixed = ~1 + Z_ESCS,
                         formula.random = ~1,
                         wgtlevel1 <- "W_FSTUWT",
                         wgtlevel2="one",
                         maxiter=1000, se = TRUE)
  
  print("calculate model 2")
  m2 <-BIFIE.twolevelreg(BIFIEobj = test,
                         dep = "READ_SCIE",
                         idcluster =  "SCHOOLID",
                         formula.fixed = ~1 + Z_MEAN_ESCS,
                         formula.random = ~1,
                         wgtlevel1 <- "W_FSTUWT",
                         wgtlevel2="one",
                         maxiter=1000, se = TRUE)
  
  print("calculate model 3")
  m3 <-BIFIE.twolevelreg(BIFIEobj = test,
                         dep = "READ_SCIE",
                         idcluster =  "SCHOOLID",
                         formula.fixed = ~1 + Z_ESCS * Z_MEAN_ESCS,
                         formula.random = ~1,
                         wgtlevel1 <- "W_FSTUWT",
                         wgtlevel2="one",
                         maxiter=1000, se = TRUE) 
  
  print("calculate model 4")
  m4 <-BIFIE.twolevelreg(BIFIEobj = test,
                         dep = "READ_SCIE",
                         idcluster =  "SCHOOLID",
                         formula.fixed = ~1 + MIG_2ND  + Z_MEAN_MIG + LANGN,
                         formula.random = ~1,
                         wgtlevel1 <- "W_FSTUWT",
                         wgtlevel2="one",
                         maxiter=1000, se = TRUE) 
  
  print("calculate model 5")
  m5 <-BIFIE.twolevelreg(BIFIEobj = test,
                         dep = "READ_SCIE",
                         idcluster =  "SCHOOLID",
                         formula.fixed = ~1 + Z_DISCLIMA + Z_TEACHER_ABSEENTEISM + Z_STUDENT_ABSEENTEISM,
                         formula.random = ~1,
                         wgtlevel1 <- "W_FSTUWT",
                         wgtlevel2="one",
                         maxiter=1000, se = TRUE)  
  
  print("calculate model 6")
  m6 <-BIFIE.twolevelreg(BIFIEobj = test,
                         dep = "READ_SCIE",
                         idcluster =  "SCHOOLID",
                         formula.fixed = ~1 + Z_AUTONOMY + Z_LEADERSHIP + Z_ACCOUNTABILITY,
                         formula.random = ~1,
                         wgtlevel1 <- "W_FSTUWT",
                         wgtlevel2="one",
                         maxiter=1000, se = TRUE)  
  

  print("calculate model 9")
  m9 <-BIFIE.twolevelreg(BIFIEobj = test,
                         dep = "READ_SCIE",
                         idcluster =  "SCHOOLID",
                         formula.fixed = ~1 + Z_ESCS * Z_MEAN_ESCS + Z_MEAN_MIG +
                           MIG_2ND + LANGN + 
                           Z_DISCLIMA + Z_TEACHER_ABSEENTEISM + Z_STUDENT_ABSEENTEISM +
                           Z_AUTONOMY + Z_LEADERSHIP + Z_ACCOUNTABILITY,
                         formula.random = ~1,
                         wgtlevel1 <- "W_FSTUWT",
                         wgtlevel2= "one",
                         maxiter=1000, se = TRUE)
  }else{
    
    print("calculate model 0")
    m0 <-BIFIE.twolevelreg(BIFIEobj = test,
                           dep = "READ_SCIE",
                           idcluster =  "SCHOOLID",
                           formula.fixed = ~1, 
                           formula.random = ~1,
                           wgtlevel1 <- "W_FSTUWT",
                           wgtlevel2="one",
                           maxiter=1000, se = TRUE)
    
    print("calculate model 1")
    m1 <-BIFIE.twolevelreg(BIFIEobj = test,
                           dep = "READ_SCIE",
                           idcluster =  "SCHOOLID",
                           formula.fixed = ~1 + Z_ESCS,
                           formula.random = ~1,
                           wgtlevel1 <- "W_FSTUWT",
                           maxiter=1000, se = TRUE)
    
    print("calculate model 2")
    m2 <-BIFIE.twolevelreg(BIFIEobj = test,
                           dep = "READ_SCIE",
                           idcluster =  "SCHOOLID",
                           formula.fixed = ~1 + Z_MEAN_ESCS,
                           formula.random = ~1,
                           wgtlevel1 <- "W_FSTUWT",
                           maxiter=1000, se = TRUE)
    
    print("calculate model 3")
    m3 <-BIFIE.twolevelreg(BIFIEobj = test,
                           dep = "READ_SCIE",
                           idcluster =  "SCHOOLID",
                           formula.fixed = ~1 + Z_ESCS * Z_MEAN_ESCS,
                           formula.random = ~1,
                           wgtlevel1 <- "W_FSTUWT",
                           maxiter=1000, se = TRUE) 
    
    print("calculate model 4")
    
    m4 <-BIFIE.twolevelreg(BIFIEobj = test,
                           dep = "READ_SCIE",
                           idcluster =  "SCHOOLID",
                           formula.fixed = ~1 + MIG_2ND  + Z_MEAN_MIG + LANGN,
                           formula.random = ~1,
                           wgtlevel1 <- "W_FSTUWT",
                           maxiter=1000, se = TRUE) 
    
    print("calculate model 5")
    if(country != "CAN"){
    m5 <-BIFIE.twolevelreg(BIFIEobj = test,
                           dep = "READ_SCIE",
                           idcluster =  "SCHOOLID",
                           formula.fixed = ~1 + Z_DISCLIMA + 
                             Z_TEACHER_ABSEENTEISM + Z_STUDENT_ABSEENTEISM,
                           formula.random = ~1,
                           wgtlevel1 <- "W_FSTUWT",
                           maxiter=1000, se = TRUE)  
    }else{
      m5 <-BIFIE.twolevelreg(BIFIEobj = test,
                             dep = "READ_SCIE",
                             idcluster =  "SCHOOLID",
                             formula.fixed = ~1 + Z_DISCLIMA + 
                               Z_TEACHER_ABSEENTEISM + Z_STUDENT_ABSEENTEISM,
                             formula.random = ~1 ,
                             wgtlevel1 <- "W_FSTUWT",
                             maxiter=1000, se = TRUE)  
      
    }
    
    print("calculate model 6")
    m6 <-BIFIE.twolevelreg(BIFIEobj = test,
                           dep = "READ_SCIE",
                           idcluster =  "SCHOOLID",
                           formula.fixed = ~1 + Z_AUTONOMY + Z_LEADERSHIP + Z_ACCOUNTABILITY,
                           formula.random = ~1,
                           wgtlevel1 <- "W_FSTUWT",
                           maxiter=1000, se = TRUE)  
    

    
    print("calculate model 9")
    if(country != "CAN"){
    m9 <-BIFIE.twolevelreg(BIFIEobj = test,
                           dep = "READ_SCIE",
                           idcluster =  "SCHOOLID",
                           formula.fixed = ~1 + Z_ESCS * Z_MEAN_ESCS + Z_MEAN_MIG +
                             MIG_2ND + LANGN + 
                             Z_DISCLIMA  + Z_TEACHER_ABSEENTEISM + Z_STUDENT_ABSEENTEISM+ 
                             Z_AUTONOMY + Z_LEADERSHIP + Z_ACCOUNTABILITY,
                           formula.random = ~1,
                           wgtlevel1 <- "W_FSTUWT",
                           maxiter=1000, se = TRUE)
    }else{
      m9 <-BIFIE.twolevelreg(BIFIEobj = test,
                             dep = "READ_SCIE",
                             idcluster =  "SCHOOLID",
                             formula.fixed = ~1 + Z_ESCS * Z_MEAN_ESCS + Z_MEAN_MIG +
                               MIG_2ND + LANGN + 
                               Z_DISCLIMA  + Z_TEACHER_ABSEENTEISM + Z_STUDENT_ABSEENTEISM+ 
                               Z_AUTONOMY + Z_LEADERSHIP + Z_ACCOUNTABILITY,
                             formula.random = ~1,
                             wgtlevel1 <- "W_FSTUWT",
                             maxiter=1000, se = TRUE)
    }
  }

  
  # Generate Outputs --------------------------------------------------------
  
  setwd(output)
  
  ##generate statistics for export (fixed effects)
  stats_m0 <- bifie_stats(m0)
  stats_m1 <- bifie_stats(m1)
  stats_m2 <- bifie_stats(m2)
  stats_m3 <- bifie_stats(m3)
  stats_m4 <- bifie_stats(m4)
  stats_m5 <- bifie_stats(m5)
  stats_m6 <- bifie_stats(m6)
  stats_m9 <- bifie_stats(m9)
  
  ##generate statistics for export (random effects)
  
  random_effects_m1 <- as_tibble(m1$stat) ## extract stats
  random_effects_m2 <- as_tibble(m2$stat) ## extract stats
  random_effects_m3 <- as_tibble(m3$stat) ## extract stats
  random_effects_m4 <- as_tibble(m4$stat) ## extract stats
  random_effects_m5 <- as_tibble(m5$stat) ## extract stats
  random_effects_m6 <- as_tibble(m6$stat) ## extract stats
  random_effects_m9 <- as_tibble(m9$stat) ## extract stats
  
  
  ## get list of variance components for random effects models
  random_effects_m1 <- random_effects_m1 %>%
    filter(str_detect(parameter,"^Var_")) %>%
    filter(!str_detect(parameter,"Total"))
  
  random_effects_m2 <- random_effects_m2 %>%
    filter(str_detect(parameter,"^Var_")) %>%
    filter(!str_detect(parameter,"Total"))
  
  random_effects_m3 <- random_effects_m3 %>%
    filter(str_detect(parameter,"^Var_")) %>%
    filter(!str_detect(parameter,"Total"))  
  
  random_effects_m4 <- random_effects_m4 %>%
    filter(str_detect(parameter,"^Var_")) %>%
    filter(!str_detect(parameter,"Total")) 
  
  random_effects_m5 <- random_effects_m5 %>%
    filter(str_detect(parameter,"^Var_")) %>%
    filter(!str_detect(parameter,"Total")) 
  
  random_effects_m6 <- random_effects_m6 %>%
    filter(str_detect(parameter,"^Var_")) %>%
    filter(!str_detect(parameter,"Total")) 
  
  random_effects_m9 <- random_effects_m9 %>%
    filter(str_detect(parameter,"^Var_")) %>%
    filter(!str_detect(parameter,"Total")) 

# Combine statstics -------------------------------------------------------
  varnames_statistics <- c()
  
  
  ## variables and variable names for fixed effects
  for(var in stats_m9$parameter){
    varnames_statistics <- append(varnames_statistics,var)
    varnames_statistics <- append(varnames_statistics,paste0(var,"_SD"))
  }
  
  ## variables and variable names for random effects
  varnames_statistics <- append(varnames_statistics, random_effects_m9$parameter)
  
  
  
  export_data <- as.tibble(varnames_statistics) %>% 
    rename("parameter" = "value") %>%
    filter(parameter != "no Students_SD") %>%
    filter(parameter != "no Schools_SD")
  
  test <- cbind(stats_m2, random_effects_m2)
  
  ## Create Estimates with stars for fixed and random effects
  models <- list(stats_m0, 
                 stats_m1, 
                 stats_m2, 
                 stats_m3,
                 stats_m4, 
                 stats_m5, 
                 stats_m6, 
                 stats_m9)
  
  model_count <- 0
  for(MODEL in models){
    
    ## generate model name
    MODEL_NAME <- paste0("Model_",as.character(model_count))
    
    ## generate table with parameter values and p-values
    stats_est <- MODEL %>%
      select(parameter, est,p) %>% 
      mutate(est = as.character(round(est, digits=3)),
             p = round(p, digits=3)) %>%
      mutate(p = case_when(p < 0.001 ~ "***",
                           p < 0.01 ~"**",
                           p < 0.05 ~ "*",
                           p >= 0.05 ~ "",
                           p == NA ~ ""))
    
    stats_est <-stats_est %>%
      unite(est, est,p, na.rm =T, sep="")
    
    
    colnames(stats_est) <- c("parameter", MODEL_NAME)
    
    ## generate standard error estimations
    
    stats_se <- MODEL %>% 
      select(parameter,SE) %>%
      mutate(SE = paste0("(", 
                         as.character(round(SE,digits=3)),
                         ")")) 
    colnames(stats_se) <- c("parameter", MODEL_NAME)
    
    
    ## rename Standard Error parameters
    PARMS <- c()
    for(p in stats_se$parameter){
      PARMS <- append(PARMS, paste0(p,"_SD"))
    }
    stats_se$parameter <- PARMS
    rm(PARMS)
    
    
    export_data <- export_data %>%
      left_join(rbind(stats_se,stats_est), by = c("parameter"))
    model_count <- model_count +1
  }
  
  
  
  xlsx::write.xlsx(export_data,paste0("2009_sci_reading_",country,".xlsx"),
                   showNA=FALSE)
  
  
  
  vif_m1 <- vif_calculator(stats_m1,model_df) %>%
    rename(VIF_m1=VIF)
  
  vif_m2 <- vif_calculator(stats_m2,model_df) %>%
    rename(VIF_m2=VIF)
  
  vif_m3 <- vif_calculator(stats_m3,model_df) %>%
    rename(VIF_m3=VIF)
  
  vif_m4 <- vif_calculator(stats_m4,model_df) %>%
    rename(VIF_m4=VIF)
  
  vif_m5 <- vif_calculator(stats_m5,model_df) %>%
    rename(VIF_m5=VIF)
  
  vif_m6 <- vif_calculator(stats_m6,model_df) %>%
    rename(VIF_m6=VIF)

  
  vif_m9 <- vif_calculator(stats_m9,model_df) %>%
    rename(VIF_m9 = VIF)
  
  vif_dataframe <- vif_m9 %>% 
    left_join(vif_m1) %>%
    left_join(vif_m2) %>%
    left_join(vif_m3) %>%
    left_join(vif_m4) %>%
    left_join(vif_m5) %>%
    left_join(vif_m6) %>%
    relocate(VIF_m9, .after = last_col())
  
  
  vif_dataframe <-    vif_dataframe %>%
    slice(3,4,nrow(vif_dataframe)-1,1,5,2,6:10, nrow(vif_dataframe))
  
  vars <- vif_dataframe %>%
    pull(variables)
  
  vif_dataframe <- as.data.frame(lapply(vif_dataframe, function(x) round(as.numeric(x), 3)))
  vif_dataframe$variables <- vars
  

  xlsx::write.xlsx(vif_dataframe,paste0(as.character(y),"_sci_reading_",country,"_VIF.xlsx"),
                   showNA=FALSE)
  
  ##Calculate Shapiro-Francia-Tests
  shapiros <- sf_calculator(stats_m1,model_df) %>%
    add_row(sf_calculator(stats_m2,model_df)) %>%
    add_row(sf_calculator(stats_m3,model_df)) %>%
    add_row(sf_calculator(stats_m4,model_df)) %>%
    add_row(sf_calculator(stats_m5,model_df)) %>%
    add_row(sf_calculator(stats_m6,model_df)) %>%
    add_row(sf_calculator(stats_m9,model_df))
  xlsx::write.xlsx(shapiros,paste0(as.character(y),"_sci_reading_",country,"_sf-test.xlsx"),
                   showNA=FALSE)
  


# Create the resdualplots -------------------------------------------------
  
  
  ## create a dataframe containing the estimators of the full model
  estimators_m9 <- stats_m9 %>%
    filter(str_detect(parameter, "beta")) %>%
    select(parameter,est)
  
  #change parameter
  estimators_m9$parameter <- estimators_m9 %>%
    pull(parameter) %>%
    str_replace("beta_","") %>%
    str_replace(":","_") %>%
    str_replace_all(c("\\(" = "", "\\)" = ""))
  
  #get parameters list
  parameters_list <- estimators_m9 %>%
    pull(parameter) 
  
  ## get the data to calculate the residuals
  residuals_data <- bifie_data$dat1 %>%
    unique() %>%
    as_tibble() %>%
    mutate(Z_ESCS_Z_MEAN_ESCS = Z_ESCS * Z_MEAN_ESCS) %>%
    mutate(Z_ESCS_Z_MEAN_ESCS = Z_ESCS_Z_MEAN_ESCS - mean(Z_ESCS_Z_MEAN_ESCS) / sd(Z_ESCS_Z_MEAN_ESCS) ) %>%
    mutate(Intercept = 1)
  
  prediction_step1 <- residuals_data %>%
    select(estimators_m9$parameter)
  
  ## multiply each variable by the predictor and sum afterwards
  i <- 1
  while(i <= length(parameters_list)){
  prediction_step1[parameters_list[i]] <- prediction_step1 %>%
    select(parameters_list[i]) *
    estimators_m9 %>%
    filter(parameter == parameters_list[i]) %>%
      pull(est)
  
  print(paste(parameters_list[i],
              i,
    head(prediction_step1[parameters_list[i]]))
      )
  
  i <- i + 1
  }
  
  ## make a rowwise dataframe
  prediction_step2 <- prediction_step1 %>%
    mutate(id = sequence(1,nrow(prediction_step1))) %>%
    rowwise(id)
  prediction_step2 <- prediction_step2 %>%
    mutate(predict = sum(c_across()))
  
  
  ## attach the imputed mean reading and science scores
  prediction_step2["READ_SCIE"] <- residuals_data %>%
    pull(READ_SCIE)
  
  ## calculate the differences
  prediction_step2 <- prediction_step2 %>%
    mutate(pred_diff = READ_SCIE - predict)
  
  
  ## create a plot with measured versus predicted values
  predict_plot <- ggplot(prediction_step2, aes(x=READ_SCIE,y=predict)) +
    geom_point(alpha = 0.3, color="steelblue") + 
    stat_density_2d(aes(fill = ..level..), geom="polygon", alpha = 0.3) +
    geom_smooth(method=lm, color = "black")
  
  ## add a heat map by kernel density plot
  predict_plot <- predict_plot + 
    xlab("Measured: imputed PISA mean scores of science and reading") +
    ylab("Predicted: imputed PISA mean scores\nof science and reading") +
    ggtitle(paste("Full model:",country,"2009")) +
    theme(plot.title = element_text(hjust = 0.5))
  
  ## create a histogram with the distribution of residuals of the full model
  histogram <- ggplot(prediction_step2, aes(x = pred_diff,
                               y =..density..)) +
    geom_histogram(binwidth = 20, 
                   fill = "steelblue",
                   color = "black",
                   alpha = 0.5) +
    geom_density(fill = "steelblue",
                 color = "black",
                 alpha = 0.5) +
    geom_vline(xintercept = 0, linetype=2)
  
  histogram <- histogram + 
    xlab("Residuals: imputed PISA mean scores of science and reading") + 
    ggtitle(paste("Full model:",country, 2009)) +
    theme(plot.title = element_text(hjust = 0.5))
  
  ## save the plots
  ggsave(plot = predict_plot, 
         filename = paste0("prediction_plot_full_model",country,"_2009.png.png"),
         dpi = 300,
         units= "cm",
         width = 15,
         height = 10)
  
  ggsave(plot = histogram, 
         filename = paste0("residuals_histogram_full_model",country,"_2009.png"),
         dpi = 300,
         units= "cm",
         width = 15,
         height = 10)
} 

